Compare pseudobulks generate with Scaden and SimBu simulator¶

QC metrics: Pearson correlation, unsupervised learning

In [1]:
# Import packages

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
import umap
import matplotlib.pyplot as plt

1.1 Load data¶

In [2]:
scaden_sim = pd.read_csv('/ceph/ibmi/it/projects/ML_BI/03_Convolution_tools/Scaden_pseudobulk_simulator/2025-02-01_Simulation_test/2025_02_01_Hao_noTregs_pseudobulks_all_genes.csv', index_col = 0)
simbu_sim = pd.read_csv('/ceph/ibmi/it/projects/ML_BI/03_Convolution_tools/SimBu/Pseudobulk_counts_cpm.csv', index_col = 0)
props = pd.read_csv('/ceph/ibmi/it/projects/ML_BI/03_Convolution_tools/Scaden_pseudobulk_simulator/2025-02-01_Simulation_test/2025_02_01_Hao_noTregs_pseudobulk_proprotions.csv', index_col = 0)
In [3]:
scaden_sim
Out[3]:
MIR1302-2HG FAM138A OR4F5 AL627309.1 AL627309.3 AL627309.2 AL627309.4 AL732372.1 OR4F29 AC114498.1 ... AC007325.2 BX072566.1 AL354822.1 AC023491.2 AC004556.1 AC233755.2 AC233755.1 AC240274.1 AC213203.1 FAM231C
Sample_1 0.0 0.0 0.0 0.370606 0.110511 0.0 0.127872 0.0 0.0 0.0 ... 0.0 0.0 0.593364 0.0 0.984596 0.000000 0.457881 4.829695 0.0 0.0
Sample_2 0.0 0.0 0.0 0.712745 0.114787 0.0 0.095963 0.0 0.0 0.0 ... 0.0 0.0 0.736858 0.0 2.387050 0.000000 0.000000 5.151184 0.0 0.0
Sample_3 0.0 0.0 0.0 0.094767 0.000000 0.0 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.502463 0.0 1.552100 0.000000 0.000000 4.763111 0.0 0.0
Sample_4 0.0 0.0 0.0 0.181507 0.113450 0.0 0.314874 0.0 0.0 0.0 ... 0.0 0.0 0.955879 0.0 3.332111 0.096092 0.568417 5.468141 0.0 0.0
Sample_5 0.0 0.0 0.0 0.667383 0.000000 0.0 0.000000 0.0 0.0 0.0 ... 0.0 0.0 1.738111 0.0 1.610475 1.080704 1.436441 5.671488 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_4996 0.0 0.0 0.0 0.118903 0.000000 0.0 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.936557 0.0 2.663581 0.000000 0.453806 3.953459 0.0 0.0
Sample_4997 0.0 0.0 0.0 0.333447 0.000000 0.0 0.147849 0.0 0.0 0.0 ... 0.0 0.0 1.383900 0.0 1.235414 0.268113 0.248774 4.897303 0.0 0.0
Sample_4998 0.0 0.0 0.0 0.232474 0.000000 0.0 0.106206 0.0 0.0 0.0 ... 0.0 0.0 1.561484 0.0 2.168203 0.000000 0.000000 4.473345 0.0 0.0
Sample_4999 0.0 0.0 0.0 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.0 ... 0.0 0.0 0.759304 0.0 1.106401 0.000000 0.000000 5.440742 0.0 0.0
Sample_5000 0.0 0.0 0.0 0.233797 0.000000 0.0 0.000000 0.0 0.0 0.0 ... 0.0 0.0 1.499544 0.0 1.488148 0.000000 0.000000 5.293614 0.0 0.0

5000 rows × 33538 columns

In [4]:
simbu_sim
Out[4]:
OR4F5 AL627309.1 AL627309.3 AL627309.4 AL669831.2 AL669831.5 FAM87B LINC00115 FAM41C AL645608.7 ... AC007325.1 AC007325.4 AC007325.2 AL354822.1 AC023491.2 AC004556.1 AC233755.2 AC233755.1 AC240274.1 FAM231C
custom_sample1 0.0 0.000000 0.000000 0.163660 0.099780 8.711838 0.000000 2.596957 2.019361 0.0 ... 0.0 0.465367 0.0 1.284496 0.0 1.442526 0.000000 0.425373 5.188903 0.0
custom_sample2 0.0 0.184925 0.000000 0.162402 0.000000 6.590872 0.056287 2.695470 1.264278 0.0 ... 0.0 1.397537 0.0 1.052462 0.0 2.573749 0.000000 0.000000 5.652859 0.0
custom_sample3 0.0 0.315182 0.000000 0.000000 0.000000 7.641985 0.041571 2.828153 0.930387 0.0 ... 0.0 0.840060 0.0 0.774918 0.0 0.955169 0.000000 0.120758 4.246535 0.0
custom_sample4 0.0 0.541012 0.000000 0.000000 0.000000 8.630444 0.000000 3.393205 2.356295 0.0 ... 0.0 1.399480 0.0 1.845916 0.0 3.022844 0.095804 0.539089 5.251889 0.0
custom_sample5 0.0 0.425550 0.234137 0.000000 0.000000 8.210455 0.140766 3.355921 2.004164 0.0 ... 0.0 1.045345 0.0 1.347112 0.0 1.613941 0.114364 0.421398 7.873895 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
custom_sample4996 0.0 0.607859 0.000000 0.000000 0.150617 9.703276 0.038482 5.172569 1.134135 0.0 ... 0.0 1.518119 0.0 1.383370 0.0 3.310197 0.000000 0.529817 4.815675 0.0
custom_sample4997 0.0 0.000000 0.000000 0.000000 0.000000 6.963689 0.140907 3.269562 1.377838 0.0 ... 0.0 0.811047 0.0 1.215111 0.0 2.466429 0.114100 0.373499 5.135589 0.0
custom_sample4998 0.0 0.560628 0.000000 0.000000 0.000000 7.547874 0.087859 3.333674 1.718951 0.0 ... 0.0 1.233451 0.0 0.845520 0.0 2.616630 0.000000 0.000000 5.014159 0.0
custom_sample4999 0.0 0.568575 0.000000 0.000000 0.000000 5.820337 0.000000 2.841114 1.656792 0.0 ... 0.0 0.836029 0.0 1.917313 0.0 2.601889 0.000000 0.000000 4.157759 0.0
custom_sample5000 0.0 0.147645 0.000000 0.000000 0.000000 6.854379 0.000000 2.858554 1.508969 0.0 ... 0.0 1.712113 0.0 0.806658 0.0 0.937638 0.147907 0.179695 3.996304 0.0

5000 rows × 27227 columns

In [5]:
props
Out[5]:
B cells Monocytes NK cells T cells CD4 conv T cells CD8 mDC
Sample_1 0.381764 0.191383 0.013026 0.369739 0.025050 0.019038
Sample_2 0.063190 0.456369 0.028084 0.138415 0.287864 0.026078
Sample_3 0.037111 0.112337 0.023069 0.612839 0.203611 0.011033
Sample_4 0.275827 0.395186 0.017051 0.276830 0.020060 0.015045
Sample_5 0.554664 0.130391 0.002006 0.169509 0.116349 0.027081
... ... ... ... ... ... ...
Sample_4996 0.061122 0.108216 0.242485 0.128257 0.025050 0.434870
Sample_4997 0.353059 0.157472 0.033099 0.263791 0.117352 0.075226
Sample_4998 0.028084 0.298897 0.186560 0.191575 0.165496 0.129388
Sample_4999 0.091274 0.148445 0.070211 0.227683 0.408225 0.054162
Sample_5000 0.073220 0.067202 0.278837 0.007021 0.567703 0.006018

5000 rows × 6 columns

Scaden-based pseudobulks subset to simbu genes:

In [6]:
scaden_sim_reduced = scaden_sim[simbu_sim.columns.tolist()]
scaden_sim_reduced
Out[6]:
OR4F5 AL627309.1 AL627309.3 AL627309.4 AL669831.2 AL669831.5 FAM87B LINC00115 FAM41C AL645608.7 ... AC007325.1 AC007325.4 AC007325.2 AL354822.1 AC023491.2 AC004556.1 AC233755.2 AC233755.1 AC240274.1 FAM231C
Sample_1 0.0 0.370606 0.110511 0.127872 0.0 7.415947 0.115385 3.495553 1.964149 0.0 ... 0.0 0.910253 0.0 0.593364 0.0 0.984596 0.000000 0.457881 4.829695 0.0
Sample_2 0.0 0.712745 0.114787 0.095963 0.0 8.289736 0.044897 4.723462 2.206447 0.0 ... 0.0 1.867092 0.0 0.736858 0.0 2.387050 0.000000 0.000000 5.151184 0.0
Sample_3 0.0 0.094767 0.000000 0.000000 0.0 4.779330 0.000000 2.720801 1.399353 0.0 ... 0.0 1.012640 0.0 0.502463 0.0 1.552100 0.000000 0.000000 4.763111 0.0
Sample_4 0.0 0.181507 0.113450 0.314874 0.0 6.688718 0.072258 4.119207 2.100340 0.0 ... 0.0 0.817438 0.0 0.955879 0.0 3.332111 0.096092 0.568417 5.468141 0.0
Sample_5 0.0 0.667383 0.000000 0.000000 0.0 8.573359 0.000000 3.119849 2.042023 0.0 ... 0.0 1.696065 0.0 1.738111 0.0 1.610475 1.080704 1.436441 5.671488 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_4996 0.0 0.118903 0.000000 0.000000 0.0 9.905708 0.101151 3.092281 1.569958 0.0 ... 0.0 1.133847 0.0 0.936557 0.0 2.663581 0.000000 0.453806 3.953459 0.0
Sample_4997 0.0 0.333447 0.000000 0.147849 0.0 9.724789 0.000000 2.314342 2.010708 0.0 ... 0.0 0.611013 0.0 1.383900 0.0 1.235414 0.268113 0.248774 4.897303 0.0
Sample_4998 0.0 0.232474 0.000000 0.106206 0.0 7.758388 0.000000 4.420318 1.624826 0.0 ... 0.0 1.567479 0.0 1.561484 0.0 2.168203 0.000000 0.000000 4.473345 0.0
Sample_4999 0.0 0.000000 0.000000 0.000000 0.0 6.834526 0.357834 2.419930 0.977676 0.0 ... 0.0 1.945167 0.0 0.759304 0.0 1.106401 0.000000 0.000000 5.440742 0.0
Sample_5000 0.0 0.233797 0.000000 0.000000 0.0 5.853649 0.110003 2.026748 2.808350 0.0 ... 0.0 0.858751 0.0 1.499544 0.0 1.488148 0.000000 0.000000 5.293614 0.0

5000 rows × 27227 columns

In [7]:
# Merge proportions with sample labels for simbu and scaden simulations
props2 = props.copy()
props['SimType'] = ['Scaden-based'] * props.shape[0]
props2.index = simbu_sim.index.tolist()
props2['SimType'] = ['SimBu-based'] * props2.shape[0]
props_merged = pd.concat([props,props2])
props_merged
Out[7]:
B cells Monocytes NK cells T cells CD4 conv T cells CD8 mDC SimType
Sample_1 0.381764 0.191383 0.013026 0.369739 0.025050 0.019038 Scaden-based
Sample_2 0.063190 0.456369 0.028084 0.138415 0.287864 0.026078 Scaden-based
Sample_3 0.037111 0.112337 0.023069 0.612839 0.203611 0.011033 Scaden-based
Sample_4 0.275827 0.395186 0.017051 0.276830 0.020060 0.015045 Scaden-based
Sample_5 0.554664 0.130391 0.002006 0.169509 0.116349 0.027081 Scaden-based
... ... ... ... ... ... ... ...
custom_sample4996 0.061122 0.108216 0.242485 0.128257 0.025050 0.434870 SimBu-based
custom_sample4997 0.353059 0.157472 0.033099 0.263791 0.117352 0.075226 SimBu-based
custom_sample4998 0.028084 0.298897 0.186560 0.191575 0.165496 0.129388 SimBu-based
custom_sample4999 0.091274 0.148445 0.070211 0.227683 0.408225 0.054162 SimBu-based
custom_sample5000 0.073220 0.067202 0.278837 0.007021 0.567703 0.006018 SimBu-based

10000 rows × 7 columns

In [8]:
# Merge scaden and simbu pseudobulks
pseudobulks_merged = pd.concat([scaden_sim_reduced, simbu_sim])
pseudobulks_merged
Out[8]:
OR4F5 AL627309.1 AL627309.3 AL627309.4 AL669831.2 AL669831.5 FAM87B LINC00115 FAM41C AL645608.7 ... AC007325.1 AC007325.4 AC007325.2 AL354822.1 AC023491.2 AC004556.1 AC233755.2 AC233755.1 AC240274.1 FAM231C
Sample_1 0.0 0.370606 0.110511 0.127872 0.000000 7.415947 0.115385 3.495553 1.964149 0.0 ... 0.0 0.910253 0.0 0.593364 0.0 0.984596 0.000000 0.457881 4.829695 0.0
Sample_2 0.0 0.712745 0.114787 0.095963 0.000000 8.289736 0.044897 4.723462 2.206447 0.0 ... 0.0 1.867092 0.0 0.736858 0.0 2.387050 0.000000 0.000000 5.151184 0.0
Sample_3 0.0 0.094767 0.000000 0.000000 0.000000 4.779330 0.000000 2.720801 1.399353 0.0 ... 0.0 1.012640 0.0 0.502463 0.0 1.552100 0.000000 0.000000 4.763111 0.0
Sample_4 0.0 0.181507 0.113450 0.314874 0.000000 6.688718 0.072258 4.119207 2.100340 0.0 ... 0.0 0.817438 0.0 0.955879 0.0 3.332111 0.096092 0.568417 5.468141 0.0
Sample_5 0.0 0.667383 0.000000 0.000000 0.000000 8.573359 0.000000 3.119849 2.042023 0.0 ... 0.0 1.696065 0.0 1.738111 0.0 1.610475 1.080704 1.436441 5.671488 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
custom_sample4996 0.0 0.607859 0.000000 0.000000 0.150617 9.703276 0.038482 5.172569 1.134135 0.0 ... 0.0 1.518119 0.0 1.383370 0.0 3.310197 0.000000 0.529817 4.815675 0.0
custom_sample4997 0.0 0.000000 0.000000 0.000000 0.000000 6.963689 0.140907 3.269562 1.377838 0.0 ... 0.0 0.811047 0.0 1.215111 0.0 2.466429 0.114100 0.373499 5.135589 0.0
custom_sample4998 0.0 0.560628 0.000000 0.000000 0.000000 7.547874 0.087859 3.333674 1.718951 0.0 ... 0.0 1.233451 0.0 0.845520 0.0 2.616630 0.000000 0.000000 5.014159 0.0
custom_sample4999 0.0 0.568575 0.000000 0.000000 0.000000 5.820337 0.000000 2.841114 1.656792 0.0 ... 0.0 0.836029 0.0 1.917313 0.0 2.601889 0.000000 0.000000 4.157759 0.0
custom_sample5000 0.0 0.147645 0.000000 0.000000 0.000000 6.854379 0.000000 2.858554 1.508969 0.0 ... 0.0 1.712113 0.0 0.806658 0.0 0.937638 0.147907 0.179695 3.996304 0.0

10000 rows × 27227 columns

1.2 Using unsupervised learning techniques¶

In [9]:
# Create a plotting function to label samples by simulation technique

from sklearn.decomposition import PCA

# Define the function

def clusterSamples(df, metadata):
    metadata = metadata
    metadata['ID'] = metadata.index.tolist()
    metaData_final = metadata

    fig, axs = plt.subplots(2,2, figsize = (10,10))
    axs[1,1].axis('off')
    # plt.subplots_adjust(hspace=0.15)
    plt.suptitle('Analysis of pseudobulks from Scaden- and SimBu-based simulation', fontweight = 'bold', fontsize = 'x-large')
    plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=0.9, wspace=0.45,hspace=0.4)

    pca = PCA(n_components = 2)
    components = pca.fit_transform(df)
    pca.fit(df)

    labels = metaData_final['SimType']
    X = pd.DataFrame(pd.concat([pd.DataFrame(components, index=metaData_final['ID']), pd.DataFrame(labels, index=metaData_final['ID'])], axis = 1, ignore_index=False))
    X.columns = ["Dim1", "Dim2", "SimType"]
    sns.scatterplot(ax = axs[0,0], x = "Dim1", y = "Dim2", data = X, hue = "SimType", size = 1, legend = False)
    axs[0,0].set_title('PCA')
    axs[0,0].set_xlabel("Dim 1 (explained variance " + str(round(pca.explained_variance_ratio_[0] *100, ndigits=2)) + " %)")
    axs[0,0].set_ylabel("Dim 2 (explained variance " + str(round(pca.explained_variance_ratio_[1] *100, ndigits=2)) + " %)")


    fit = umap.UMAP()
    u = fit.fit_transform(df)

    labels = metaData_final['SimType']
    X = pd.DataFrame(pd.concat([pd.DataFrame(u, index=metaData_final['ID']), pd.DataFrame(labels, index=metaData_final['ID'])], axis = 1, ignore_index=False))
    X.columns = ["Dim1", "Dim2", "SimType"]
    sns.scatterplot(ax = axs[0,1], x = "Dim1", y = "Dim2", data = X, hue = "SimType", size = 1, legend = False)
    axs[0,1].set_title('UMAP')

    u = TSNE(n_components = 2, learning_rate = 'auto', init = 'random', perplexity = 30).fit_transform(df)
    labels = metaData_final['SimType']
    X = pd.DataFrame(pd.concat([pd.DataFrame(u, index=metaData_final['ID']), pd.DataFrame(labels, index=metaData_final['ID'])], axis = 1, ignore_index=False))
    X.columns = ["Dim1", "Dim2", "SimType"]
    sns.scatterplot(ax = axs[1,0], x = "Dim1", y = "Dim2", data = X, hue = "SimType", size = 1)
    sns.move_legend(axs[1,0], "upper left", bbox_to_anchor=(1, 1), ncol = 4, title = 'Sample annotation')
    axs[1,0].set_title('t-SNE')
In [10]:
clusterSamples(pseudobulks_merged, props_merged)
/home/schauerm/miniconda3/envs/Jupyter_new/lib/python3.11/site-packages/numba/np/ufunc/parallel.py:371: NumbaWarning: The TBB threading layer requires TBB version 2021 update 6 or later i.e., TBB_INTERFACE_VERSION >= 12060. Found TBB_INTERFACE_VERSION = 12050. The TBB threading layer is disabled.
  warnings.warn(problem)
No description has been provided for this image
In [11]:
# Function for scattering data and coloring
# Define the function

def clusterCellTypeAbundance(df, metadata, cellType):
    metadata = metadata
    metadata['ID'] = metadata.index.tolist()
    metaData_final = metadata

    fig, axs = plt.subplots(2,2, figsize = (10,10))
    axs[1,1].axis('off')
    # plt.subplots_adjust(hspace=0.15)
    plt.suptitle('Analysis of pseudobulks from Scaden- and SimBu-based simulation for abundance of ' + str(cellType), fontweight = 'bold', fontsize = 'x-large')
    plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=0.9, wspace=0.45,hspace=0.4)

    pca = PCA(n_components = 2)
    components = pca.fit_transform(df)
    pca.fit(df)

    labels = metaData_final[cellType]
    X = pd.DataFrame(pd.concat([pd.DataFrame(components, index=metaData_final['ID']), pd.DataFrame(labels, index=metaData_final['ID'])], axis = 1, ignore_index=False))
    X.columns = ["Dim1", "Dim2", cellType]
    sns.scatterplot(ax = axs[0,0], x = "Dim1", y = "Dim2", data = X, hue = cellType, legend = False, size = 1)
    axs[0,0].set_title('PCA')
    axs[0,0].set_xlabel("Dim 1 (explained variance " + str(round(pca.explained_variance_ratio_[0] *100, ndigits=2)) + " %)")
    axs[0,0].set_ylabel("Dim 2 (explained variance " + str(round(pca.explained_variance_ratio_[1] *100, ndigits=2)) + " %)")


    fit = umap.UMAP()
    u = fit.fit_transform(df)

    labels = metaData_final[cellType]
    X = pd.DataFrame(pd.concat([pd.DataFrame(u, index=metaData_final['ID']), pd.DataFrame(labels, index=metaData_final['ID'])], axis = 1, ignore_index=False))
    X.columns = ["Dim1", "Dim2", cellType]
    sns.scatterplot(ax = axs[0,1], x = "Dim1", y = "Dim2", data = X, hue = cellType, legend = False, size = 1)
    axs[0,1].set_title('UMAP')

    u = TSNE(n_components = 2, learning_rate = 'auto', init = 'random', perplexity = 30).fit_transform(df)
    labels = metaData_final[cellType]
    X = pd.DataFrame(pd.concat([pd.DataFrame(u, index=metaData_final['ID']), pd.DataFrame(labels, index=metaData_final['ID'])], axis = 1, ignore_index=False))
    X.columns = ["Dim1", "Dim2", cellType]
    sns.scatterplot(ax = axs[1,0], x = "Dim1", y = "Dim2", data = X, hue = cellType, size = 1)
    sns.move_legend(axs[1,0], "upper left", bbox_to_anchor=(1, 1), ncol = 4, title = 'Sample annotation')
    axs[1,0].set_title('t-SNE')
In [12]:
clusterCellTypeAbundance(pseudobulks_merged, props_merged, "B cells")
No description has been provided for this image
In [13]:
clusterCellTypeAbundance(pseudobulks_merged, props_merged, "NK cells")
No description has been provided for this image
In [14]:
clusterCellTypeAbundance(pseudobulks_merged, props_merged, "T cells CD8")
No description has been provided for this image
In [15]:
clusterCellTypeAbundance(pseudobulks_merged, props_merged, "T cells CD4 conv")
No description has been provided for this image
In [16]:
clusterCellTypeAbundance(pseudobulks_merged, props_merged, "Monocytes")
No description has been provided for this image
In [52]:
clusterCellTypeAbundance(pseudobulks_merged, props_merged, "mDC")
No description has been provided for this image

1.3 Correlation analysis¶

In [17]:
# Define QC functions

from sklearn.metrics import r2_score
import numpy as np

# SVT R2 score (via square root, which is a variance stabilizing transformation)
def r2_score_svt(x_data, y_data):
    x_svt = x_data.transform(lambda x: np.sqrt(x+0.1)) 
    y_svt = y_data.transform(lambda x: np.sqrt(x+0.1))
    r2_svt = r2_score(x_svt,y_svt)
    return r2_svt


# Concordance correlation coefficient
def ccc(x_data,y_data):
    sxy = np.sum((x_data - x_data.mean())*(y_data - y_data.mean()))/x_data.shape[0]
    rhoc = 2*sxy / (np.var(x_data) + np.var(y_data) + (x_data.mean() - y_data.mean())**2)
    return rhoc


# Pearson Correlation Coefficient
def r(x_data,y_data):
    ''' Pearson Correlation Coefficient'''
    sxy = np.sum((x_data - x_data.mean())*(y_data - y_data.mean()))/x_data.shape[0]
    rho = sxy / (np.std(x_data)*np.std(y_data))
    return rho


# RMSE
def rmse(x_data,y_data):
    MSE = np.square((x_data-y_data)).mean()
    RMSE = np.sqrt(MSE)
    return RMSE
In [45]:
# Function to loop over real and corresponding simu samples and perform QC scoring

def QCscoring(df):
    dfQC = df.copy()
    dfQC = dfQC.iloc[0:(int(dfQC.shape[0]/2)),0:4]
    dfQC.columns = ['R', 'RMSE', 'CCC', 'R_svt']
    
    for i in range(0,dfQC.shape[0]):

        dfQC.iloc[i,0] = float(r(df.iloc[i,:], df.iloc[dfQC.shape[0]+i,:]))
        dfQC.iloc[i,1] = float(rmse(df.iloc[i,:], df.iloc[dfQC.shape[0]+i,:]))
        dfQC.iloc[i,2] = float(ccc(df.iloc[i,:], df.iloc[dfQC.shape[0]+i,:]))
        dfQC.iloc[i,3] = float(r2_score_svt(df.iloc[i,:], df.iloc[dfQC.shape[0]+i,:]))
        
    return dfQC
In [46]:
dfQC = QCscoring(pseudobulks_merged)
dfQC
Out[46]:
R RMSE CCC R_svt
Sample_1 0.999943 5.025617 0.999943 0.998412
Sample_2 0.999869 8.151744 0.999822 0.998441
Sample_3 0.999898 7.276777 0.999897 0.998175
Sample_4 0.999843 8.486409 0.999809 0.998197
Sample_5 0.999844 8.437040 0.999843 0.997695
... ... ... ... ...
Sample_4996 0.999515 14.619367 0.999332 0.998454
Sample_4997 0.999814 9.438227 0.999795 0.998218
Sample_4998 0.999836 7.898549 0.999836 0.997932
Sample_4999 0.999932 5.987983 0.999925 0.998101
Sample_5000 0.999938 5.757768 0.999938 0.998168

5000 rows × 4 columns

Visualization of QC metrics¶

In [66]:
# Define a plot to visualize QC metrics

def plotQC(df):

    fig, axs = plt.subplots(1,4, figsize = (20, 5))

    sns.boxplot(ax = axs[0], y = df["R"], fill = None)
    #sns.stripplot(y = df["R"], dodge=True, ax=axs[0], legend=False, size = 4)
    #axs[0].set(ylim=(0.999,1.0))
    axs[0].set_title('R')

    sns.boxplot(ax = axs[1], y = df["CCC"], fill = None)
    #sns.stripplot(y = df["CCC"], dodge=True, ax=axs[1], legend=False, size = 4)
    #axs[1].set(ylim=(0.999,1.0))
    axs[1].set_title('CCC')

    sns.boxplot(ax = axs[2], y = df["R_svt"], fill = None)
    #sns.stripplot(y = df["R_svt"], dodge=True, ax=axs[2], legend=False, size = 4)
    #axs[2].set(ylim=(0.999,1.0))
    axs[2].set_title('R_svt')

    sns.boxplot(ax = axs[3], y = df["RMSE"], fill = None)
    #sns.stripplot(y = df["RMSE"], dodge=True, ax=axs[3], legend=False, size = 4)
    axs[3].set(ylim=(0,25))
    axs[3].set_title('RMSE')
    
In [67]:
plotQC(dfQC)
No description has been provided for this image